In [1]:
from pymongo import MongoClient
from sets import Set
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load every crawled entry of chart 125 (the original author's tag: bb200)
# and reduce it to the first and last appearance day of each artist.
client = MongoClient('10.123.14.47')
try:
    db = client.charts
    charts = db.crawled_charts
    records = charts.find({"chart_name_id": 125})

    artists = []
    artist_names = []
    days = []
    # find() returns a lazy cursor, so it has to be consumed while the
    # client is still open; closing first breaks on recent PyMongo releases.
    for rec in records:
        # Entries without an artist id cannot be attributed; skip them.
        if "artist_id" in rec:
            artists.append(rec['artist_id'])
            artist_names.append(rec['artist_name'])
            days.append(rec['day'])
finally:
    # Release the connection even if iteration fails part-way through.
    client.close()

bbappearances = pd.DataFrame({'id': artists, 'name': artist_names, 'appearance_day': days})

# One row per (id, name) pair with columns 'min' and 'max' holding the
# earliest and latest appearance_day. A list of aggregations is used because
# the renaming-dict form of SeriesGroupBy.agg was removed from pandas.
bb_range = bbappearances.groupby(['id', 'name'])['appearance_day'].agg(['min', 'max'])
In [2]:
# Day thresholds, per the original author's note: 16071 is 1/1/2014 and
# 16436 is 1/1/2015.
first_2014_day, first_2015_day = 16071, 16436

earliest_seen = bb_range['min']
latest_seen = bb_range['max']

# True when the earliest recorded appearance predates 2014.
bb_range['appeared_before_2014'] = earliest_seen.lt(first_2014_day)
# True when the latest recorded appearance falls inside 2014.
# NOTE(review): this keys off the *last* appearance, so an artist still
# charting on or after 1/1/2015 is flagged False -- confirm that is intended.
bb_range['appeared_in_2014'] = latest_seen.ge(first_2014_day) & latest_seen.lt(first_2015_day)

# Turn the (id, name) group keys back into ordinary columns.
bb_range.reset_index(inplace=True)
bb_range.head()
Out[2]:
In [3]:
from nbsutils import imp
# Top 3000 entities by the value of metric 310 at unix_seconds=1388534400
# (2014-01-01 00:00:00 if the column is UTC), restricted to count_type 'd'.
query_b200 = """
select entity_id, value as val_310 from idx_entity idx
where metric_id=310
and unix_seconds=1388534400
and count_type='d'
order by value desc
limit 3000
"""
# Template: {0} is a metric id. Selects the top 3000 entities by their
# average value over 1380585600 <= unix_seconds < 1388534400
# (2013-10-01 up to, but excluding, 2014-01-01 if the column is UTC).
query_other = """
select entity_id, avg(value) as val_{0} from idx_entity idx
where metric_id={0}
and unix_seconds<1388534400
and unix_seconds>=1380585600
and count_type='d'
group by entity_id
order by avg(value) desc
limit 3000
"""
db = imp.Connection()
try:
    # Presumably returns a DataFrame (it is later copied and passed to pd.merge).
    data = db.fetchAll(query_b200)  # bb200 values on 1/1/2014
finally:
    # Close the connection even if the query raises.
    db.close()
In [4]:
# Start from the metric-310 frame and outer-join one val_<metric id> column
# per additional metric. Each of those columns is the average produced by
# query_other (the window before 1388534400), not a single-day value.
data_all = data.copy()
db = imp.Connection()
try:
    for mid in [11, 28, 42, 44, 256, 41]:
        data_other = db.fetchAll(query_other.format(mid))
        # Outer join keeps entities that rank in the top 3000 for any metric.
        data_all = pd.merge(data_all, data_other, on='entity_id', how='outer')
finally:
    # Close the connection even if one of the queries raises.
    db.close()
In [7]:
# Keep only the chart columns needed downstream and rename the artist id so
# it matches the join key of the metric data.
bbrange_slim = (
    bb_range[['id', 'appeared_before_2014', 'appeared_in_2014', 'min']]
    .rename(columns={'id': 'entity_id'})
)

# Merging metric data with chart appearance data; entities that never appear
# in bb_range get missing values in the chart columns.
# NOTE(review): bb_range is grouped by (id, name), so an id recorded under
# several names would produce several rows here and duplicate that entity
# after the merge -- confirm ids map to a single name.
data2 = pd.merge(data_all, bbrange_slim, on='entity_id', how='left')

# A missing flag means "no chart record", i.e. False. Cast explicitly so the
# columns are real booleans instead of object dtype holding True/False.
for flag in ['appeared_before_2014', 'appeared_in_2014']:
    data2[flag] = data2[flag].fillna(False).astype(bool)

# Entities with no chart appearance before 2014.
data_never_charted = data2.loc[~data2['appeared_before_2014'], :]
In [6]:
# For each metric, rank the not-previously-charted entities by that metric
# (highest first) and count how many of the top 50 / 100 / 1000 carry the
# appeared_in_2014 flag.
for mid in [310, 11, 28, 42, 44, 256, 41]:
    # Sort once per metric; DataFrame.sort was removed in favour of sort_values.
    ranked_flags = data_never_charted.sort_values(
        'val_{}'.format(mid), ascending=False
    )['appeared_in_2014']
    hits_in_1000 = np.sum(ranked_flags.head(1000))
    hits_in_100 = np.sum(ranked_flags.head(100))
    hits_in_50 = np.sum(ranked_flags.head(50))
    print("{}: {} hits in 100, {} hits in 1000, {} hits in 50".format(mid, hits_in_100, hits_in_1000, hits_in_50))
In [38]:
from matplotlib import pyplot as plt
# Histogram of how many days after 1/1/2014 (day 16071) the earliest chart
# appearance occurred, for the flagged entities among the top 300 by val_310.
top_by_310 = data_never_charted.sort_values('val_310', ascending=False).head(300)
# Build the mask from the 300-row subset itself rather than the full frame.
days_to_chart = (top_by_310.loc[top_by_310['appeared_in_2014'], 'min'] - 16071).tolist()

plt.hist(days_to_chart, bins=15)
plt.xlabel('Days after 1/1/2014 until earliest chart appearance')
plt.ylabel('Number of artists')
plt.title('Top 300 by val_310, flagged appeared_in_2014')
plt.show()
In [41]:
# For each metric, take the 500 highest-ranked not-previously-charted
# entities and average the days between 1/1/2014 (day 16071) and the earliest
# chart appearance of those flagged appeared_in_2014.
print("Of the top 500 artists, average days to hit chart")
for mid in [310, 11, 28, 42, 44, 256, 41]:
    # DataFrame.sort was removed in favour of sort_values.
    top_ranked = data_never_charted.sort_values('val_{}'.format(mid), ascending=False).head(500)
    # Build the mask from the 500-row subset itself rather than the full frame.
    days_to_chart = (top_ranked.loc[top_ranked['appeared_in_2014'], 'min'] - 16071).tolist()
    # np.mean of an empty list yields nan, same as before.
    print("{}: average days = {}".format(mid, np.mean(days_to_chart)))
In [ ]: